#-*- coding=utf-8 -*-

from __future__ import unicode_literals
import jieba
import jieba.analyse
import jieba.posseg
import sys
sys.path.append("../")
reload(sys)
sys.setdefaultencoding('utf-8')

# Load the sensitive-word dictionary
def load_dict(filename = "./utils/textprocessing/deal/mgc.txt"):
    """Read the sensitive-word list from *filename*.

    Returns a list with one stripped entry per line of the file
    (blank lines yield empty strings, matching the original behavior).
    """
    word_dict = list()
    # 'with' guarantees the file handle is closed even on error
    # (the original leaked the handle by never calling close()).
    with open(filename) as f:
        for line in f:
            word_dict.append(line.strip())
    return word_dict

# Sensitive-word detection
def dealSensitiveWord(text=""):
    """Scan *text* for words from the sensitive-word dictionary.

    Returns a JSON-formatted string mapping the match index ("1", "2", ...)
    to an object with keys "sensitiveWord", "start" and "end" (the token's
    character offsets reported by jieba.tokenize in search mode).
    Returns "{}" when nothing matches.
    """
    dicmgc = load_dict()
    # Register every sensitive word as a user-defined dictionary entry
    # so jieba keeps it together during tokenization (Py2 str -> unicode).
    for n in dicmgc:
        jieba.suggest_freq(unicode(n, "utf-8"), True)

    # Search mode returns each token with its start/end position in the text.
    result = jieba.tokenize(text, mode='search')

    # Build the membership set once: O(1) lookup per token instead of
    # scanning the whole dictionary for every token.
    sensitive = set(dicmgc)

    parts = []
    count = 0
    for w in result:
        if w[0].encode('utf-8') in sensitive:
            count += 1
            parts.append('"'+str(count)+'":{"sensitiveWord":"'+str(w[0])+'","start":'+str(w[1])+',"end":'+str(w[2])+'}')
    # BUG FIX: the original did strJson[:-1]+"}", which chopped the opening
    # "{" when there were zero matches and returned the invalid string "}".
    return "{" + ",".join(parts) + "}"

# Word segmentation
def dealcut(text=""):
    """Segment *text* with jieba in accurate (non-full) mode.

    Returns the tokens joined by "/ " as a single display string.
    """
    tokens = jieba.cut(text, cut_all=False)
    separator = "/ "
    return separator.join(tokens)

# Keyword extraction
def dealKeyWord(text=""):
    """Extract keywords from *text* using jieba's TextRank algorithm.

    Returns the keywords concatenated with a tab after each word
    (including a trailing tab, matching the original output); the
    TextRank weights are requested but intentionally discarded.
    """
    keyWordList = jieba.analyse.textrank(text, withWeight=True)
    # join builds the string in one pass instead of quadratic "+=" in a loop
    return "".join(w + "\t" for w, _weight in keyWordList)

# Part-of-speech tagging
def dealPartOfSpeech(text=""):
    """Tag *text* with jieba's POS segmenter.

    Returns one "word<TAB>flag" entry per line (each line newline-terminated,
    matching the original output).
    """
    words = jieba.posseg.cut(text)
    # join builds the string in one pass instead of quadratic "+=" in a loop
    return "".join(word + "\t" + flag + "\n" for word, flag in words)
